# Load the raw train and test extracts. Only `perimetro_encefalico` gets an
# explicit parser (col_number()); the remaining columns use readr's defaults.
raw_train <- "../../data/raw/train_data.txt" %>%
  read_csv(col_types = cols(perimetro_encefalico = col_number()))
raw_test <- "../../data/raw/test_data.txt" %>%
  read_csv(col_types = cols(perimetro_encefalico = col_number()))

# Stack both extracts into one table used by most of the exploration below.
df_salud <- bind_rows(raw_train, raw_test)

# Column-by-column overview of the training extract.
glimpse(raw_train)
Observations: 43,933
Variables: 23
$ BMIZ <dbl> 2.1740218, 2.9977233, 2.3279958, -0.5328662, -0.5228759, -0.3...
$ HAZ <dbl> -1.03324437, -1.30227085, -0.54952410, -2.19561134, -0.507069...
$ WAZ <dbl> 0.9507068, 1.4414035, 1.4729592, -1.6518444, -0.6833294, -0.3...
$ individuo <int> 26316, 26316, 26316, 21124, 21124, 21124, 21124, 21127, 21127...
$ bmi <dbl> 19.85226, 21.83281, 21.00399, 16.15882, 16.56805, 16.80319, 1...
$ departamento_indec_id <int> 882, 882, 882, 274, 274, 274, 274, 28, 28, 28, 357, 357, 357,...
$ departamento_lat <dbl> -34.09624, -34.09624, -34.09624, -34.79435, -34.79435, -34.79...
$ departamento_long <dbl> -59.02863, -59.02863, -59.02863, -58.26468, -58.26468, -58.26...
$ fecha_control <date> 2013-09-20, 2013-10-17, 2014-03-07, 2013-10-16, 2013-12-18, ...
$ fecha_nacimiento <date> 2013-07-15, 2013-07-15, 2013-07-15, 2013-07-16, 2013-07-16, ...
$ fecha_proximo_control <date> 2013-10-17, 2014-03-07, 2014-04-02, 2013-12-18, 2014-02-17, ...
$ genero <chr> "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "M", "...
$ nombre_provincia <chr> "BuenosAires", "BuenosAires", "BuenosAires", "BuenosAires", "...
$ nombre_region <chr> "Centro", "Centro", "Centro", "Centro", "Centro", "Centro", "...
$ perimetro_encefalico <dbl> 39, 41, 46, 42, 44, 46, 46, 41, 44, 45, 40, 43, NA, NA, 40, 4...
$ peso <dbl> 6.45, 7.60, 10.00, 5.25, 7.00, 8.00, 8.00, 5.05, 6.00, 7.00, ...
$ provincia_indec_id <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6...
$ talla <int> 57, 59, 69, 57, 65, 69, 70, 56, 61, 64, 62, 65, 66, 69, 55, 5...
$ var_BMIZ <dbl> 0.823701559, -0.669727507, 0.053218512, 0.009990294, 0.147864...
$ var_HAZ <dbl> -0.26902648, 0.75274675, -0.52544122, 1.68854217, 0.36600262,...
$ var_WAZ <dbl> 0.49069669, 0.03155565, -0.27268116, 0.96851504, 0.31394974, ...
$ zona_rural <chr> "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "N", "...
$ decae <chr> "False", "False", "True", "False", "False", "False", "False",...
# Frequency of each `perimetro_encefalico` value in the training data, rarest
# values first, handed to View() for interactive inspection.
raw_train %>%
  count(perimetro_encefalico) %>%
  arrange(n) %>%
  View()
#' Plot the share of `decae == "True"` rows for every value of a column.
#'
#' @param df Data frame containing `column` and a `decae` column whose
#'   positive label is the string "True".
#' @param column Unquoted name of the column to group by (captured with
#'   `enquo()`).
#' @return A plotly object with one marker per value of `column`: x is the
#'   number of "True" rows for that value, y is the proportion of "True" rows
#'   among its rows with a known `decae`, and the marker text is the value.
de_moivre_plot <- function(df, column) {
  column <- enquo(column)

  df %>%
    # Rows with a missing outcome would otherwise be counted in sum(n) and
    # deflate the proportion of every group that contains them.
    filter(!is.na(decae)) %>%
    count(!!column, decae) %>%
    group_by(!!column) %>%
    mutate(
      prop = n / sum(n)
    ) %>%
    filter(decae == "True") %>%
    # NOTE(review): x is the count of "True" rows, not the size of the group;
    # confirm that this is the intended horizontal axis.
    plot_ly(
      x = ~n
    ) %>%
    add_markers(
      y = ~prop,
      text = quo(!!column)
    )
}
# Share of `decae == "True"` rows for every `talla` value in the combined
# table, plotted against the number of "True" rows.
df_salud %>%
  # df_salud is built with bind_rows(), which leaves NA wherever a source
  # table lacks a column. Rows with a missing `decae` must be dropped before
  # counting, otherwise they inflate sum(n) and deflate every proportion.
  filter(!is.na(decae)) %>%
  count(talla, decae) %>%
  group_by(talla) %>%
  mutate(prop = n / sum(n)) %>%
  filter(decae == "True") %>%
  plot_ly(x = ~n) %>%
  add_markers(y = ~prop, text = ~talla)
# Proportion-vs-count plot by `talla`, built with the helper on the training
# data only.
de_moivre_plot(raw_train, talla)

# Every training row that belongs to two specific individuals.
filter(raw_train, individuo %in% c(75687, 75557))

# Histogram of `bmi` in the training data.
plot_ly(raw_train) %>%
  add_histogram(x = ~bmi)

# Time elapsed between two dates, expressed in 365-day years.
as.numeric(as.Date("2013-12-13") - as.Date("2011-10-14")) / 365
[1] 2.167123
¡¿Los niños se encogen?! Veamos la diferencia de talla entre controles consecutivos de cada individuo.
# Bar chart of how often each change in `talla` occurs between consecutive
# controls of the same individual.
df_salud %>%
  arrange(individuo, fecha_control) %>%
  group_by(individuo) %>%
  mutate(
    # Change against the individual's previous control; NA on the first one.
    diff_talla = talla - lag(talla),
    # NOTE(review): named "shrinkage" but flags increases above 11, and it is
    # not used further down this pipeline; confirm the intended condition.
    shrinkage = diff_talla > 11
  ) %>%
  ungroup() %>%
  filter(!is.na(diff_talla)) %>%
  count(diff_talla) %>%
  plot_ly(x = ~diff_talla) %>%
  add_bars(y = ~n)
NA
# Number of rows per birth date, most frequent first.
count(df_salud, fecha_nacimiento, sort = TRUE)

# All rows for a single birth date, ordered by control date.
df_salud %>%
  filter(fecha_nacimiento == "2013-09-19") %>%
  arrange(fecha_control)

# Earliest/latest control and birth dates, and the latest scheduled control.
summarize(
  df_salud,
  min(fecha_control),
  max(fecha_control),
  min(fecha_nacimiento),
  max(fecha_nacimiento),
  max(fecha_proximo_control)
)
Copado, veamos qué tan estables son las variables para cada individuo.
# For individuals with exactly four rows, count how many distinct values each
# column takes within the individual, then summarise those counts over all
# such individuals.
df_salud %>%
  group_by(individuo) %>%
  filter(n() == 4L) %>%
  summarize_all(n_distinct) %>%
  summary()
individuo BMIZ HAZ WAZ bmi
Min. : 22 Min. :2.000 Min. :2.000 Min. :2.000 Min. :2.000
1st Qu.:13604 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000 1st Qu.:4.000
Median :28765 Median :4.000 Median :4.000 Median :4.000 Median :4.000
Mean :30833 Mean :3.999 Mean :3.999 Mean :3.999 Mean :3.957
3rd Qu.:44684 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:4.000
Max. :75688 Max. :4.000 Max. :4.000 Max. :4.000 Max. :4.000
departamento_indec_id departamento_lat departamento_long fecha_control fecha_nacimiento
Min. :1.000 Min. :1.000 Min. :1.000 Min. :2.000 Min. :1
1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:4.000 1st Qu.:1
Median :1.000 Median :1.000 Median :1.000 Median :4.000 Median :1
Mean :1.032 Mean :1.032 Mean :1.032 Mean :3.999 Mean :1
3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:4.000 3rd Qu.:1
Max. :3.000 Max. :3.000 Max. :3.000 Max. :4.000 Max. :1
fecha_proximo_control genero nombre_provincia nombre_region perimetro_encefalico
Min. :2.000 Min. :1 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:4.000 1st Qu.:1 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:3.000
Median :4.000 Median :1 Median :1.000 Median :1.000 Median :4.000
Mean :3.999 Mean :1 Mean :1.002 Mean :1.002 Mean :3.301
3rd Qu.:4.000 3rd Qu.:1 3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:4.000
Max. :4.000 Max. :1 Max. :2.000 Max. :2.000 Max. :4.000
peso provincia_indec_id talla var_BMIZ var_HAZ var_WAZ
Min. :1.000 Min. :1.000 Min. :1.000 Min. :2 Min. :2 Min. :2
1st Qu.:4.000 1st Qu.:1.000 1st Qu.:4.000 1st Qu.:4 1st Qu.:4 1st Qu.:4
Median :4.000 Median :1.000 Median :4.000 Median :4 Median :4 Median :4
Mean :3.796 Mean :1.002 Mean :3.784 Mean :4 Mean :4 Mean :4
3rd Qu.:4.000 3rd Qu.:1.000 3rd Qu.:4.000 3rd Qu.:4 3rd Qu.:4 3rd Qu.:4
Max. :4.000 Max. :2.000 Max. :4.000 Max. :4 Max. :4 Max. :4
zona_rural decae
Min. :1.000 Min. :1.000
1st Qu.:1.000 1st Qu.:1.000
Median :1.000 Median :2.000
Mean :1.011 Mean :1.936
3rd Qu.:1.000 3rd Qu.:2.000
Max. :2.000 Max. :3.000
# Share of each `decae` outcome among rows with and without the `shrinkage`
# flag. `diff_talla` and `shrinkage` are not columns of df_salud, so they are
# derived here before filtering.
df_salud %>%
  arrange(individuo, fecha_control) %>%
  group_by(individuo) %>%
  mutate(
    # Change against the individual's previous control; NA on the first one.
    diff_talla = talla - lag(talla),
    # NOTE(review): named "shrinkage" but flags increases above 11; confirm
    # the intended condition.
    shrinkage = diff_talla > 11
  ) %>%
  ungroup() %>%
  # Keep only rows with a previous control and a known outcome.
  filter(!is.na(diff_talla), !is.na(decae)) %>%
  count(shrinkage, decae) %>%
  group_by(shrinkage) %>%
  mutate(
    prop = n / sum(n)
  ) %>%
  ungroup()
# Number of controls by age at the control date, where age is the elapsed
# days divided by 365 and truncated to an integer.
df_salud %>%
  mutate(
    edad = as.integer(as.numeric(fecha_control - fecha_nacimiento) / 365)
  ) %>%
  count(edad) %>%
  plot_ly(x = ~edad) %>%
  add_bars(y = ~n)

# Print the combined table.
df_salud